# Attach the packages used throughout the analysis.
# library() fails loudly at startup when a package is missing, unlike
# require(), which merely returns FALSE and lets the script fail later.
library(tidyverse)
library(data.table)
library(lubridate)
library(stringr)
library(ggvis)
library(ggplot2)
# setwd(dir="D:\\Users\\gdrtmh\\Desktop\\Kaggle\\Jobs on Naukri")
# Read the raw Naukri job listings. Empty cells and the site's placeholder
# strings ("Not Mentioned", "Not Disclosed by Recruiter") are treated as NA
# up front so downstream cleaning can rely on is.na().
raw_jobs <- read.csv(file = "naukri_com-job_sample.csv",
                     strip.white = TRUE,
                     na.strings = c("", "NA", "Not Mentioned",
                                    "Not Disclosed by Recruiter"),
                     stringsAsFactors = FALSE)
inputdt <- as.data.table(raw_jobs)
# Quick overview of columns and example values
glimpse(inputdt)
Observations: 22,000
Variables: 14
$ company <chr> "MM Media Pvt Ltd", "find live infotech", "Softtech Career Infosystem Pvt. Ltd",...
$ education <chr> "UG: B.Tech/B.E. - Any Specialization PG:Any Postgraduate - Any Specialization, ...
$ experience <chr> "0 - 1 yrs", "0 - 0 yrs", "4 - 8 yrs", "11 - 15 yrs", "6 - 8 yrs", "2 - 5 yrs", ...
$ industry <chr> "Media / Entertainment / Internet", "Advertising / PR / MR / Event Management", ...
$ jobdescription <chr> "Job Description  Send me Jobs like this Qualifications: - == > 10th To Gradua...
$ jobid <dbl> 210516002263, 210516002391, 101016900534, 81016900536, 120916002122, 13101600507...
$ joblocation_address <chr> "Chennai", "Chennai", "Bengaluru", "Mumbai, Bengaluru, Kolkata, Chennai, Coimbat...
$ jobtitle <chr> "Walkin Data Entry Operator (night Shift)", "Work Based Onhome Based Part Time."...
$ numberofpositions <int> NA, 60, NA, NA, 4, NA, 2, 20, 2, NA, NA, NA, 2, NA, NA, 2, 3, NA, NA, NA, NA, NA...
$ payrate <chr> "1,50,000 - 2,25,000 P.A", "1,50,000 - 2,50,000 P.A. 20000", NA, NA, NA, NA, "3,...
$ postdate <chr> "2016-05-21 19:30:00 +0000", "2016-05-21 19:30:00 +0000", "2016-10-13 16:20:55 +...
$ site_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ skills <chr> "ITES", "Marketing", "IT Software - Application Programming", "Accounts", "IT So...
$ uniq_id <chr> "43b19632647068535437c774b6ca6cf8", "d4c72325e57f89f364812b5ed5a795f0", "c47df6f...
# Distribution of site_name (the factor summary also reports the NA count)
summary(as.factor(inputdt$site_name))
www.naukri.com NA's
3987 18013
# Drop the near-constant site_name column
inputdt$site_name <- NULL
Well, it seems the site_name column is filled with either "www.naukri.com" or NA. This column can probably be removed without influencing the subsequent data analysis.
# Number of distinct uniq_id values (data.table::uniqueN)
uniqueN(inputdt$uniq_id)
[1] 22000
This would be useful as a unique identifier for individual job listings.
# Number of rows with a distinct jobid, for comparison with uniq_id
uniqueN(inputdt, by=c("jobid"))
[1] 21910
It is not exactly clear how "jobid" is structured; perhaps it accounts for the relisting of jobs?
However, since the number of unique jobid values differs from uniq_id by only 90, if the purpose were to depict the relisting of jobs, it probably did not do its job very well.
# Payrate listings typically follow the format "1,50,000 - 2,50,000 P.A"
# Strip everything from " P.A" onward. The dots are escaped so they match
# literally; the original pattern " P.A.*$" let "." match ANY character,
# which could clip unrelated text.
inputdt$payrate.M <- gsub(" P\\.A.*$", "", inputdt$payrate)
# this removes trailing numbers after the P.A string and also drops
# incentive/commission notes that some listings append to the payrate
# strip every character that is not a digit or the range dash
inputdt$payrate.M <- gsub("[^0-9-]", "", inputdt$payrate.M)
# inputdt$m.payrate<-gsub("[^0-9,-,-]", "", inputdt$m.payrate)
# Outliers
# Some payrates are not extracted properly (more than one "-" remains after
# the regex treatment); only ~200 of them, so they are ignored for now.
inputdt[lengths(strsplit(inputdt$payrate.M, "-")) > 2, list(payrate, payrate.M), ]
Could potentially improve the regular expression by extracting only the numbers between the dashes, but that would require more work with regex, so it is put off for now.
Splitting the payrate column while ignoring entries that have more than one dash ("-") symbol.
# Identifying the appropriate number of splits for payrate with str "-"
splits <- max(lengths(strsplit(inputdt$payrate.M, "-")))
# Split payrate into m.payrate1/m.payrate2, skipping rows with extra splits.
# NOTE: `:=` modifies inputdt by reference, so mdt and inputdt point at the
# same data.table after this assignment.
mdt <- inputdt[lengths(strsplit(inputdt$payrate.M, "-")) <= 2,
               paste0("m.payrate", 1:2) := tstrsplit(payrate.M, "-", fixed = TRUE)][]
# Convert the split bounds to numerics
mdt$m.payrate1 <- as.numeric(mdt$m.payrate1)
mdt$m.payrate2 <- as.numeric(mdt$m.payrate2)
# Mean of the payrate bounds — easier for plotting.
# rowMeans() is already row-wise, so no `by` is needed; the original
# `by = uniq_id` just turned this into a slow one-row-per-group loop
# with an identical result.
mdt[, m.payrate.Mean := rowMeans(.SD), .SDcols = c("m.payrate1", "m.payrate2")]
# Checking whether m.payrate1 is always lower than m.payrate2, so that
# m.payrate1 can be treated as the lower salary bound and m.payrate2 as the
# upper bound (the original comment said "experience" — a copy-paste slip)
# summary(mdt$m.payrate1>mdt$m.payrate2)
# Mode FALSE TRUE NA's
# logical 2 4743 17255
# listing the data with m.payrate1 > m.payrate2
mdt[m.payrate1>m.payrate2,list(payrate,payrate.M,m.payrate1,m.payrate2)]
In both of these cases, the payrate is not well formatted and extracted. There are probably other bad extractions not shown here, hidden among the rows where m.payrate2 > m.payrate1.
Removing these cases (where m.payrate1 > m.payrate2) by setting them to NA.
# Invalidate the badly-extracted rows (lower bound above upper bound) by
# blanking every derived payrate column in one assignment
mdt[m.payrate1 > m.payrate2,
    c("payrate.M", "m.payrate1", "m.payrate2", "m.payrate.Mean") := NA]
# The majority of the records has no payrate listed.
# summary(mdt$m.payrate.Mean)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# 9.000e+00 2.500e+05 3.750e+05 3.672e+09 8.000e+05 1.740e+13 17255
# The maximum figure also points to likely errors in extraction.
# After trying a few cut thresholds, 9e+6 looks most suitable; the table
# below shows the payrates that fall above this threshold.
mdt[m.payrate.Mean>9e+6] %>%
.[,list(payrate, m.payrate.Mean)]
In fact, it almost seems like the postings with payrate = "Pay Band: PB3 (Rs.15600-39100) with Academic Grade Pay of Rs.8,000/p. m." are simply repostings, seeing that about 80 job listings share this identical payrate.
# Blank out the payrates above the 9e+6 outlier threshold
mdt[m.payrate.Mean > 9e+6, `:=`(payrate.M = NA,
                                m.payrate1 = NA,
                                m.payrate2 = NA,
                                m.payrate.Mean = NA)]
# total count of valid payrate records
N_valid_payrate <- sum(!is.na(mdt$m.payrate.Mean))
# Share of listings with a usable payrate: valid records / total records.
# BUG FIX: the original divided the count by sum(mdt$m.payrate.Mean) —
# the sum of the salary VALUES — which is meaningless as a percentage.
payrate_record_percentage <- N_valid_payrate / nrow(mdt)
Sadly, only payrate_record_percentage of the job listings have their payrate listed and extracted so far — a mere N_valid_payrate records.
# all_values <- function(x) {
#   if(is.null(x)) return(NULL)
#   paste0(names(x), ": ", format(x), collapse = "<br />")
# }
# Density plot of the extracted payrate columns, restricted to rows with a
# valid mean below the outlier cutoff (the two filters are combined here)
melt(mdt[!is.na(m.payrate.Mean) & m.payrate.Mean < 9e+6],
     measure.vars = patterns("^m.payrate")) %>%
  ggvis(~value, fill = ~variable) %>%
  group_by(variable) %>%
  layer_densities()
# First few skill labels
head(inputdt$skills)
[1] "ITES" "Marketing"
[3] "IT Software - Application Programming" "Accounts"
[5] "IT Software - Application Programming" "IT Software - Application Programming"
# All distinct skill labels
unique(inputdt$skills)
[1] "ITES" "Marketing"
[3] "IT Software - Application Programming" "Accounts"
[5] "Production" "Sales"
[7] "IT Software - Other" "Executive Assistant"
[9] "IT Software - Mobile" "Engineering Design"
[11] "Financial Services" "Hotels"
[13] "IT Software - QA & Testing" "HR"
[15] "Supply Chain" "IT Software - Network Administration"
[17] "Architecture" "Legal"
[19] "Site Engineering" "Journalism"
[21] NA "IT Software - DBA"
[23] "Strategy" "Medical"
[25] "Design" "Defence Forces"
[27] "IT Software - Mainframe" "IT Software - Telecom Software"
[29] "IT Software - Embedded" "IT Software - Middleware"
[31] "Teaching" "IT Software - System Programming"
[33] "IT Software - Client/Server Programming" "Travel"
[35] "IT Software - eCommerce" "TV"
[37] "Fashion Designing" "IT Software - ERP"
[39] "IT Hardware" "Analytics & Business Intelligence"
[41] "Beauty/Fitness/Spa Services" "Top Management"
[43] "Export" "IT Software - Systems"
[45] "Packaging" "Shipping"
# Bar chart of the 10 most frequent skills
inputdt[, .N, by = skills] %>%
  .[order(-N)] %>%
  head(10) %>%
  ggvis(x = ~skills, y = ~N) %>%
  layer_bars() %>%
  add_axis("x", properties = axis_props(
    labels = list(angle = 45, align = "left", fontSize = 10)
  ))
package 'bindrcpp' was built under R version 3.3.3
# Checking whether every valid entry has "yrs" in the records
summary(str_detect(mdt$experience, "yrs"))
Mode TRUE NA's
logical 21885 115
# Split the industry string on "/" into industryS1..industrySk columns,
# where k is the largest number of parts found in any record
splits <- max(lengths(strsplit(mdt$industry, "/")))
mdt <- mdt[, paste0("industryS", seq_len(splits)) := tstrsplit(industry, "/", fixed = TRUE)][]
# Long-format view of the split industry columns (printed, not assigned)
melt(mdt, measure.vars = patterns("^industryS"))
# dt[!str_detect(dt$experience, "yrs")]# It seems all the listing are in a matter of years
# one entry with double dash "-"
# glimpse(dt[max(lengths(strsplit(dt$experience, "-")))==lengths(strsplit(dt$experience, "-"))])
# Manually repair the one record whose experience contains an extra "-".
# BUG FIX: the original assigned through `mdt[cond]$experience <- ...`
# (a subset copy) and used `max(parts) == parts` as the condition, which
# would have matched EVERY row if no multi-dash entry existed. Use `:=`
# by reference with an explicit "> 2 parts" guard instead.
mdt[lengths(strsplit(experience, "-")) > 2, experience := "1 - 3 yrs"]
# Drop the trailing " yrs" (4 characters including the space) from the
# experience records
# nchar(mdt$experience, allowNA = TRUE)-3
mdt$experience.M <- str_sub(mdt$experience, 1,
                            nchar(mdt$experience, allowNA = TRUE) - 4)
# Split the cleaned experience range on "-"
splits <- max(lengths(strsplit(mdt$experience.M, "-")))
mdt <- mdt[, paste0("experience", seq_len(splits)) := tstrsplit(experience.M, "-", fixed = TRUE)][]
# Coerce the split bounds to numeric
mdt$experience1 <- as.numeric(mdt$experience1)
mdt$experience2 <- as.numeric(mdt$experience2)
# Checking if the values in experience1 are always lower than experience2,
# such that experience1 can be considered as a lower limit for the job, vice versa for experience2
# summary(mdt$experience1<=mdt$experience2)
# Mode TRUE NA's
# logical 21885 115
# Rename experience1/experience2 to explicit lower/upper-bound names
setnames(mdt, c("experience1","experience2"), c("m.experience.L", "m.experience.U"))
# Density of the lower/upper experience bounds for records that have them
mdt[!is.na(m.experience.L)] %>%
  melt(measure.vars = patterns("^m.experience")) %>%
  ggvis(~value, fill = ~variable) %>%
  group_by(variable) %>%
  layer_densities(adjust = 2)
# Sample of the raw postdate strings
head(mdt$postdate)
[1] "2016-05-21 19:30:00 +0000" "2016-05-21 19:30:00 +0000" "2016-10-13 16:20:55 +0000"
[4] "2016-10-13 16:20:55 +0000" "2016-10-13 16:20:55 +0000" "2016-10-13 16:20:55 +0000"
# Split the postdate ("2016-05-21 19:30:00 +0000") on the "+" separator.
# BUG FIX: the original called tstrsplit(postdate, "+", 2); the bare 2
# landed in strsplit's `fixed` argument and only worked because 2 is
# truthy. Say fixed = TRUE explicitly — "+" is a regex metacharacter.
mdt[, c("postdate_time", "postdate_timezone") := tstrsplit(postdate, "+", fixed = TRUE)]
# summary(as.factor(mdt$postdate_timezone))
# 0000 NA's
# 21977 23
# the timezone column is effectively constant, so it carries no information
mdt$postdate_timezone <- NULL
# latest posting date — NOTE(review): if postdate_time is still character,
# lexicographic max works here only because the strings are ISO-8601
max(mdt$postdate_time, na.rm = TRUE)
[1] "2017-01-11 21:00:00 UTC"
# some job listings are listed at multiple locations, e.g.
# "Delhi NCR, Mumbai, Bengaluru, Kochi, Greater Noida, Gurgaon, Hyderabad, Kozhikode, Lucknow"
splits <- max(lengths(strsplit(mdt$joblocation_address, ",")))
# checking
# mdt[lengths(strsplit(mdt$joblocation_address, ","))==splits]
# Split the address into locationS1..locationSk columns by reference.
# BUG FIX: the original piped this into melt(mdt, ...), which passed mdt
# as melt's id.vars and overwrote mdt with the long table; the following
# column selection then referenced columns ("location1", "ID") that never
# existed. Keep mdt wide and build the long location table separately.
mdt <- mdt[, paste0("locationS", 1:splits) := tstrsplit(joblocation_address, ",", fixed = TRUE)][]
dt.location <- melt(mdt[, paste0("locationS", 1:splits), with = FALSE],
                    measure.vars = paste0("locationS", 1:splits),
                    variable.name = "location_slot", value.name = "location")
#
# # Sort term_frequency in descending order
# term_frequency<- sort(mdt$location,decreasing = T)
#
# # Plot a barchart of the 10 most common words
# barplot(term_frequency[1:10], col = "tan", las = 2)
# Text-mining packages. library() errors immediately when a package is
# missing, whereas require() only warns and returns FALSE.
library(tm)
library(quanteda)
library(wordcloud)
library(slam) # for row_sums
# Build and clean a corpus of the job descriptions.
# BUG FIX: tolower is not a tm transformation, so it must be wrapped in
# content_transformer(); calling tm_map(tolower) directly replaces each
# document with a bare character vector and corrupts the VCorpus. The
# original papered over this with tm_map(PlainTextDocument), which also
# wipes the documents' metadata — no longer needed with the wrapper.
clean_VC <- VCorpus(VectorSource(mdt$jobdescription)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument, language = "english")
clean_VC.tdm <- TermDocumentMatrix(clean_VC)
# Per-term frequencies across the corpus, most frequent first.
# Note that the tdm is a simple_triplet_matrix (sparse), which has its own
# row_sums() in slam:
# class(clean_VC.tdm)
# "TermDocumentMatrix" "simple_triplet_matrix"
term_frequency <- sort(row_sums(clean_VC.tdm), decreasing = TRUE)
# Bar chart of the 10 most common stems
barplot(head(term_frequency, 10), col = "tan", las = 2)
# Term/frequency table for the word cloud
word_freqs <- data.frame(term = names(term_frequency), num = term_frequency)
head(word_freqs)
wordcloud(word_freqs$term, word_freqs$num, max.words = 100,
          random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
# Drop sparse terms: removeSparseTerms keeps only terms whose sparsity is
# at most 0.4, i.e. terms that appear in most documents
clean_VC_sparse_r40<-removeSparseTerms(clean_VC.tdm, 0.4)
# Dense per-term totals of the reduced matrix, sorted descending
tdm<-sort(rowSums(as.matrix(clean_VC_sparse_r40)), decreasing = TRUE)
word_freqs <- data.frame(term = names(tdm), num = tdm)
# Fix the layout so the cloud is reproducible
set.seed(142)
wordcloud(word_freqs$term, word_freqs$num, min.freq=20, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
# Even stricter cutoff (sparsity <= 0.1) for the comparison cloud
clean_VC_sparse_r10<-removeSparseTerms(clean_VC.tdm, 0.1)
tdm<-as.matrix(clean_VC_sparse_r10)
# NOTE(review): comparison.cloud treats matrix columns as groups; this tdm
# has one column per DOCUMENT, so a two-colour comparison across thousands
# of documents may not be meaningful — confirm the intended grouping
comparison.cloud(tdm, colors = c("orange", "blue"), max.words = 50)
# brewer.pal was usable above only because wordcloud attaches RColorBrewer;
# this makes the dependency explicit
library(RColorBrewer)
commonality.cloud(tdm, random.order=FALSE, scale=c(5, .5),colors = brewer.pal(4, "Dark2"), max.words=400)
# Re-clean the raw job descriptions WITHOUT stemming, intended to serve as
# a stem-completion dictionary.
# BUG FIX: the original started from `VC`, which is never defined anywhere
# in this file — rebuild the corpus from the raw text instead. tolower is
# wrapped in content_transformer() because tm_map(tolower) would replace
# each document with a bare character vector and corrupt the corpus.
docs <- VCorpus(VectorSource(mdt$jobdescription)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stripWhitespace)
myCorpusCopy <- docs ## creating a copy to be used as a dictionary
# docs <- docs %>%
#   tm_map(stemDocument) %>%
#   tm_map(stemCompletion, dictionary = myCorpusCopy)
docs.tdm <- TermDocumentMatrix(docs)
# Term frequencies of the unstemmed corpus.
# BUG FIX: the original summed clean_VC.tdm again, silently ignoring the
# freshly built docs.tdm — use docs.tdm here.
term_frequency <- row_sums(docs.tdm)
# Sort term_frequency in descending order
term_frequency <- sort(term_frequency, decreasing = TRUE)
# Plot a barchart of the 10 most common words
barplot(term_frequency[1:10], col = "tan", las = 2)
# wordcloud is already attached earlier in the script; library() errors
# loudly if it were somehow missing instead of returning FALSE like require()
library(wordcloud)
# Term/frequency table for the word cloud
word_freqs <- data.frame(term = names(term_frequency), num = term_frequency)
head(word_freqs)
wordcloud(word_freqs$term, word_freqs$num, max.words = 100,
          random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
# Grouped, stemmed bigram dfm of job descriptions by top-level industry.
# BUG FIX: the split industry columns are named industryS1.. (created
# earlier with tstrsplit); `mdt$industry1` never existed.
inaugdfm <- dfm(mdt$jobdescription, remove = stopwords("english"),
                stem = TRUE, groups = mdt$industryS1, ngrams = 2)
# NOTE(review): this immediately overwrites the grouped dfm above with a
# plain one, making it dead code — kept for fidelity, confirm intent
inaugdfm <- dfm(mdt$jobdescription)
# Keep terms appearing in at most 80% of documents, transposed into
# term-by-document orientation
tdm_trim <- t(dfm_trim(inaugdfm, max_docfreq = .8))
head(tdm_trim)
term_frequency <- row_sums(tdm_trim)
# Sort term_frequency in descending order
term_frequency <- sort(term_frequency, decreasing = TRUE)
word_freqs <- data.frame(term = names(term_frequency), num = term_frequency)
wordcloud(word_freqs$term, word_freqs$num, max.words = 100,
          random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
# Scratch work towards an industry-grouped quanteda analysis.
# BUG FIX: `industry1` never existed (the columns are industryS1..), the
# corpus_subset(data.frame(...)) call had a syntax error (trailing comma,
# unbalanced parentheses), and docvars() was misused. Build the corpus
# directly with the top-level industry as a document variable.
docs <- c(mdt$jobdescription, mdt$industryS1)
data(data_corpus_inaugural)  # tutorial leftover; loads the inaugural corpus
myCorpus <- corpus(mdt$jobdescription,
                   docvars = data.frame(industry = mdt$industryS1))
myStemMat <- dfm(myCorpus, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE)
myStemMat[, 1:5]